# Install (if needed) and load each requested package.
#
# Args:
#   package1, ...: package names (character), given individually or as
#                  character vectors; all arguments are flattened into one
#                  vector, so install_load("a", "b") and install_load(c("a", "b"))
#                  behave the same.
# Side effects: installs any missing package from CRAN, then attaches every
#   requested package with library().
install_load <- function(package1, ...) {
  # Flatten all arguments into a single character vector of package names
  packages <- c(package1, ...)
  # Query the local library once, not on every loop iteration
  installed <- rownames(installed.packages())
  for (package in packages) {
    # Install only when the package is missing locally; use an HTTPS CRAN
    # mirror so the download cannot be tampered with in transit
    if (!(package %in% installed)) {
      install.packages(package, repos = "https://cloud.r-project.org/")
    }
    # character.only = TRUE lets library() read the name from a variable
    library(package, character.only = TRUE)
  }
}
# Packages required by the analysis; installed/attached in one call
libs <- c("caret", "dplyr", "VIM")
install_load(libs)
package ‘caret’ was built under R version 3.4.3
Loading required package: lattice
Loading required package: ggplot2
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
package ‘VIM’ was built under R version 3.4.3
Loading required package: colorspace
Loading required package: grid
Loading required package: data.table
data.table 1.10.4.3
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
VIM is ready to use.
Since version 4.0.0 the GUI is in its own package VIMGUI.
Please use the package to use the new (and old) GUI.
Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
Attaching package: ‘VIM’
The following object is masked from ‘package:datasets’:
sleep
# Locations of the cached data and the remote course files
data_dir <- "./data"
training_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
training_file <- "pml-training.csv"
test_file <- "pml-test.csv"

# Create the data directory on first run; dir.exists() (unlike file.exists())
# is TRUE only for an actual directory
if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}
# Download each CSV only when it is not already cached locally
training_path <- file.path(data_dir, training_file)
if (!file.exists(training_path)) {
  download.file(training_url, destfile = training_path)
}
test_path <- file.path(data_dir, test_file)
if (!file.exists(test_path)) {
  download.file(test_url, destfile = test_path)
}
Load the data into two different data frames.
# Read the cached CSVs into two data frames: train (model building)
# and test (the 20-row quiz set)
train <- read.csv(file.path(data_dir, training_file))
test <- read.csv(file.path(data_dir, test_file))
dim(train)
[1] 19622 160
dim(test)
[1] 20 160
head(train)
Check whether the observations contain NA values or missing entries that could introduce errors/bias during model training.
sum(complete.cases(train))
[1] 406
Too few complete observations for correct training.
Let’s look at the column names.
There are columns with a lot of missing values.
We will retain only the columns without NA values.
First, convert all the data to numeric form to coerce the empty factors to NA.
# Keep only the numeric columns; non-numeric (factor/character) columns
# are dropped.
# NOTE(review): this also drops the factor outcome 'classe', which is later
# used at createDataPartition(trainFilter$classe, ...) — it must be re-added
# to the filtered data before modeling. TODO confirm against the full script.
trainRaw <- train[, sapply(train, is.numeric)]
testRaw <- test[, sapply(test, is.numeric)]
Remove columns with NA values
# Retain only the fully observed columns (no NA anywhere in the column)
train_complete_cols <- !vapply(trainRaw, anyNA, logical(1))
trainFilter <- trainRaw[, train_complete_cols]
test_complete_cols <- !vapply(testRaw, anyNA, logical(1))
testFilter <- testRaw[, test_complete_cols]
Dimension
dim(trainFilter)
[1] 19622 54
dim(testFilter)
[1] 20 54
Removing other unuseful columns like username, timestamp and ID
Get dimension of the filtered dataset
dim(trainFilter)
[1] 19622 53
dim(testFilter)
[1] 20 54
We will slice the Training data into a Training and a Validation set using a 70-30 split.
# Reproducible 70/30 split of the filtered training data into a training
# set and a validation set, stratified on the outcome 'classe'
set.seed(12022018) # Today's date
inTrain <- createDataPartition(trainFilter$classe, p = 0.70, list = FALSE)
trainData <- trainFilter[inTrain, ]
validationData <- trainFilter[-inTrain, ]
dim(trainData)
We will fit models using Random Forest and XGBoost (very popular in challenges like kaggle.com) for several reasons:
With tree-based models, you can safely ignore predictor correlation issues.
Zero- and near-zero-variance predictors do not affect tree-based models.
As each feature is processed separately, and the possible splits of the data don’t depend on scaling, no preprocessing like normalization or standardization of features is needed for decision tree algorithms.
# 5-fold cross-validation setup; name the argument instead of relying on
# positional matching (the bare 5 was being matched to 'number')
controlRf <- trainControl(method = "cv", number = 5)
# Fit a Random Forest on all remaining predictors, capped at 250 trees
modelRf <- train(classe ~ ., data = trainData, method = "rf",
                 trControl = controlRf, ntree = 250)
modelRf
Random Forest
13737 samples
53 predictor
5 classes: 'A', 'B', 'C', 'D', 'E'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 10989, 10990, 10990, 10989, 10990
Resampling results across tuning parameters:
mtry Accuracy Kappa
2 0.9919923 0.9898700
27 0.9959963 0.9949355
53 0.9927936 0.9908839
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 27.
# predict_rf was never created in the visible script (the XGB branch has the
# matching predict() call); generate the validation-set predictions first
predict_rf <- predict(modelRf, validationData)
confusionMatrix(validationData$classe, predict_rf)
Confusion Matrix and Statistics
Reference
Prediction A B C D E
A 1673 0 0 0 1
B 2 1129 8 0 0
C 0 2 1024 0 0
D 0 0 2 962 0
E 0 0 0 0 1082
Overall Statistics
Accuracy : 0.9975
95% CI : (0.9958, 0.9986)
No Information Rate : 0.2846
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9968
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: A Class: B Class: C Class: D Class: E
Sensitivity 0.9988 0.9982 0.9903 1.0000 0.9991
Specificity 0.9998 0.9979 0.9996 0.9996 1.0000
Pos Pred Value 0.9994 0.9912 0.9981 0.9979 1.0000
Neg Pred Value 0.9995 0.9996 0.9979 1.0000 0.9998
Prevalence 0.2846 0.1922 0.1757 0.1635 0.1840
Detection Rate 0.2843 0.1918 0.1740 0.1635 0.1839
Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
Balanced Accuracy 0.9993 0.9981 0.9950 0.9998 0.9995
Very accurate model to classify classe feature
modelXGB$modelInfo
$label
[1] "eXtreme Gradient Boosting"
$library
[1] "xgboost" "plyr"
$type
[1] "Regression" "Classification"
$parameters
$grid
function (x, y, len = NULL, search = "grid")
{
if (search == "grid") {
out <- expand.grid(max_depth = seq(1, len), nrounds = floor((1:len) *
50), eta = c(0.3, 0.4), gamma = 0, colsample_bytree = c(0.6,
0.8), min_child_weight = c(1), subsample = seq(0.5,
1, length = len))
}
else {
out <- data.frame(nrounds = sample(1:1000, size = len,
replace = TRUE), max_depth = sample(1:10, replace = TRUE,
size = len), eta = runif(len, min = 0.001, max = 0.6),
gamma = runif(len, min = 0, max = 10), colsample_bytree = runif(len,
min = 0.3, max = 0.7), min_child_weight = sample(0:20,
size = len, replace = TRUE), subsample = runif(len,
min = 0.25, max = 1))
out$nrounds <- floor(out$nrounds)
out <- out[!duplicated(out), ]
}
out
}
$loop
function (grid)
{
loop <- plyr::ddply(grid, c("eta", "max_depth", "gamma",
"colsample_bytree", "min_child_weight", "subsample"),
function(x) c(nrounds = max(x$nrounds)))
submodels <- vector(mode = "list", length = nrow(loop))
for (i in seq(along = loop$nrounds)) {
index <- which(grid$max_depth == loop$max_depth[i] &
grid$eta == loop$eta[i] & grid$gamma == loop$gamma[i] &
grid$colsample_bytree == loop$colsample_bytree[i] &
grid$min_child_weight == loop$min_child_weight[i] &
grid$subsample == loop$subsample[i])
trees <- grid[index, "nrounds"]
submodels[[i]] <- data.frame(nrounds = trees[trees !=
loop$nrounds[i]])
}
list(loop = loop, submodels = submodels)
}
<bytecode: 0x0000000030ee0c80>
$fit
function (x, y, wts, param, lev, last, classProbs, ...)
{
if (!inherits(x, "xgb.DMatrix"))
x <- as.matrix(x)
if (is.factor(y)) {
if (length(lev) == 2) {
y <- ifelse(y == lev[1], 1, 0)
if (!inherits(x, "xgb.DMatrix"))
x <- xgboost::xgb.DMatrix(x, label = y, missing = NA)
else xgboost::setinfo(x, "label", y)
if (!is.null(wts))
xgboost::setinfo(x, "weight", wts)
out <- xgboost::xgb.train(list(eta = param$eta, max_depth = param$max_depth,
gamma = param$gamma, colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight, subsample = param$subsample),
data = x, nrounds = param$nrounds, objective = "binary:logistic",
...)
}
else {
y <- as.numeric(y) - 1
if (!inherits(x, "xgb.DMatrix"))
x <- xgboost::xgb.DMatrix(x, label = y, missing = NA)
else xgboost::setinfo(x, "label", y)
if (!is.null(wts))
xgboost::setinfo(x, "weight", wts)
out <- xgboost::xgb.train(list(eta = param$eta, max_depth = param$max_depth,
gamma = param$gamma, colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight, subsample = param$subsample),
data = x, num_class = length(lev), nrounds = param$nrounds,
objective = "multi:softprob", ...)
}
}
else {
if (!inherits(x, "xgb.DMatrix"))
x <- xgboost::xgb.DMatrix(x, label = y, missing = NA)
else xgboost::setinfo(x, "label", y)
if (!is.null(wts))
xgboost::setinfo(x, "weight", wts)
out <- xgboost::xgb.train(list(eta = param$eta, max_depth = param$max_depth,
gamma = param$gamma, colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight, subsample = param$subsample),
data = x, nrounds = param$nrounds, objective = "reg:linear",
...)
}
out
}
<bytecode: 0x000000001a127e88>
$predict
function (modelFit, newdata, submodels = NULL)
{
if (!inherits(newdata, "xgb.DMatrix")) {
newdata <- as.matrix(newdata)
newdata <- xgboost::xgb.DMatrix(data = newdata, missing = NA)
}
out <- predict(modelFit, newdata)
if (modelFit$problemType == "Classification") {
if (length(modelFit$obsLevels) == 2) {
out <- ifelse(out >= 0.5, modelFit$obsLevels[1],
modelFit$obsLevels[2])
}
else {
out <- matrix(out, ncol = length(modelFit$obsLevels),
byrow = TRUE)
out <- modelFit$obsLevels[apply(out, 1, which.max)]
}
}
if (!is.null(submodels)) {
tmp <- vector(mode = "list", length = nrow(submodels) +
1)
tmp[[1]] <- out
for (j in seq(along = submodels$nrounds)) {
tmp_pred <- predict(modelFit, newdata, ntreelimit = submodels$nrounds[j])
if (modelFit$problemType == "Classification") {
if (length(modelFit$obsLevels) == 2) {
tmp_pred <- ifelse(tmp_pred >= 0.5, modelFit$obsLevels[1],
modelFit$obsLevels[2])
}
else {
tmp_pred <- matrix(tmp_pred, ncol = length(modelFit$obsLevels),
byrow = TRUE)
tmp_pred <- modelFit$obsLevels[apply(tmp_pred,
1, which.max)]
}
}
tmp[[j + 1]] <- tmp_pred
}
out <- tmp
}
out
}
<bytecode: 0x0000000033a1ccc8>
$prob
function (modelFit, newdata, submodels = NULL)
{
if (!inherits(newdata, "xgb.DMatrix")) {
newdata <- as.matrix(newdata)
newdata <- xgboost::xgb.DMatrix(data = newdata, missing = NA)
}
if (!is.null(modelFit$param$objective) && modelFit$param$objective ==
"binary:logitraw") {
p <- predict(modelFit, newdata)
out <- binomial()$linkinv(p)
}
else {
out <- predict(modelFit, newdata)
}
if (length(modelFit$obsLevels) == 2) {
out <- cbind(out, 1 - out)
colnames(out) <- modelFit$obsLevels
}
else {
out <- matrix(out, ncol = length(modelFit$obsLevels),
byrow = TRUE)
colnames(out) <- modelFit$obsLevels
}
out <- as.data.frame(out)
if (!is.null(submodels)) {
tmp <- vector(mode = "list", length = nrow(submodels) +
1)
tmp[[1]] <- out
for (j in seq(along = submodels$nrounds)) {
tmp_pred <- predict(modelFit, newdata, ntreelimit = submodels$nrounds[j])
if (length(modelFit$obsLevels) == 2) {
tmp_pred <- cbind(tmp_pred, 1 - tmp_pred)
colnames(tmp_pred) <- modelFit$obsLevels
}
else {
tmp_pred <- matrix(tmp_pred, ncol = length(modelFit$obsLevels),
byrow = TRUE)
colnames(tmp_pred) <- modelFit$obsLevels
}
tmp_pred <- as.data.frame(tmp_pred)
tmp[[j + 1]] <- tmp_pred
}
out <- tmp
}
out
}
$predictors
function (x, ...)
{
imp <- xgboost::xgb.importance(x$xNames, model = x)
x$xNames[x$xNames %in% imp$Feature]
}
$varImp
function (object, numTrees = NULL, ...)
{
imp <- xgboost::xgb.importance(object$xNames, model = object)
imp <- as.data.frame(imp)[, 1:2]
rownames(imp) <- as.character(imp[, 1])
imp <- imp[, 2, drop = FALSE]
colnames(imp) <- "Overall"
missing <- object$xNames[!(object$xNames %in% rownames(imp))]
missing_imp <- data.frame(Overall = rep(0, times = length(missing)))
rownames(missing_imp) <- missing
imp <- rbind(imp, missing_imp)
imp
}
$levels
function (x)
x$obsLevels
$tags
[1] "Tree-Based Model" "Boosting" "Ensemble Model" "Implicit Feature Selection"
[5] "Accepts Case Weights"
$sort
function (x)
{
x[order(x$nrounds, x$max_depth, x$eta, x$gamma, x$colsample_bytree,
x$min_child_weight), ]
}
modelXGB
eXtreme Gradient Boosting
13737 samples
53 predictor
5 classes: 'A', 'B', 'C', 'D', 'E'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 10989, 10990, 10991, 10988, 10990
Resampling results across tuning parameters:
eta max_depth colsample_bytree subsample nrounds Accuracy Kappa
0.3 1 0.6 0.50 50 0.8130571 0.7630000
0.3 1 0.6 0.50 100 0.8827974 0.8516047
0.3 1 0.6 0.50 150 0.9169393 0.8948639
0.3 1 0.6 0.75 50 0.8091988 0.7580610
0.3 1 0.6 0.75 100 0.8823600 0.8510801
0.3 1 0.6 0.75 150 0.9153361 0.8928512
0.3 1 0.6 1.00 50 0.8118200 0.7613677
0.3 1 0.6 1.00 100 0.8849089 0.8543212
0.3 1 0.6 1.00 150 0.9160647 0.8937910
0.3 1 0.8 0.50 50 0.8131297 0.7631086
0.3 1 0.8 0.50 100 0.8853448 0.8548423
0.3 1 0.8 0.50 150 0.9171566 0.8951287
0.3 1 0.8 0.75 50 0.8127663 0.7625787
0.3 1 0.8 0.75 100 0.8849078 0.8542995
0.3 1 0.8 0.75 150 0.9173753 0.8954213
0.3 1 0.8 1.00 50 0.8105822 0.7598251
0.3 1 0.8 1.00 100 0.8846168 0.8539364
0.3 1 0.8 1.00 150 0.9148997 0.8923266
0.3 2 0.6 0.50 50 0.9520267 0.9393048
0.3 2 0.6 0.50 100 0.9868237 0.9833326
0.3 2 0.6 0.50 150 0.9956321 0.9944752
0.3 2 0.6 0.75 50 0.9529015 0.9404256
0.3 2 0.6 0.75 100 0.9864598 0.9828744
0.3 2 0.6 0.75 150 0.9957781 0.9946598
0.3 2 0.6 1.00 50 0.9535563 0.9412486
0.3 2 0.6 1.00 100 0.9893720 0.9865557
0.3 2 0.6 1.00 150 0.9965060 0.9955806
0.3 2 0.8 0.50 50 0.9532645 0.9408745
0.3 2 0.8 0.50 100 0.9864599 0.9828722
0.3 2 0.8 0.50 150 0.9950499 0.9937385
0.3 2 0.8 0.75 50 0.9552299 0.9433718
0.3 2 0.8 0.75 100 0.9882070 0.9850824
0.3 2 0.8 0.75 150 0.9962876 0.9953044
0.3 2 0.8 1.00 50 0.9554475 0.9436529
0.3 2 0.8 1.00 100 0.9885712 0.9855437
0.3 2 0.8 1.00 150 0.9967244 0.9958568
0.3 3 0.6 0.50 50 0.9895178 0.9867398
0.3 3 0.6 0.50 100 0.9980349 0.9975144
0.3 3 0.6 0.50 150 0.9988355 0.9985271
0.3 3 0.6 0.75 50 0.9895175 0.9867387
0.3 3 0.6 0.75 100 0.9984714 0.9980666
0.3 3 0.6 0.75 150 0.9991993 0.9989873
0.3 3 0.6 1.00 50 0.9902450 0.9876603
0.3 3 0.6 1.00 100 0.9984715 0.9980668
0.3 3 0.6 1.00 150 0.9990538 0.9988033
0.3 3 0.8 0.50 50 0.9903184 0.9877538
0.3 3 0.8 0.50 100 0.9986170 0.9982508
0.3 3 0.8 0.50 150 0.9990538 0.9988032
0.3 3 0.8 0.75 50 0.9909005 0.9884899
0.3 3 0.8 0.75 100 0.9986898 0.9983428
0.3 3 0.8 0.75 150 0.9991266 0.9988953
0.3 3 0.8 1.00 50 0.9906091 0.9881210
0.3 3 0.8 1.00 100 0.9985443 0.9981587
0.3 3 0.8 1.00 150 0.9991265 0.9988952
0.4 1 0.6 0.50 50 0.8438511 0.8021729
0.4 1 0.6 0.50 100 0.9084947 0.8841705
0.4 1 0.6 0.50 150 0.9370297 0.9203081
0.4 1 0.6 0.75 50 0.8458887 0.8048153
0.4 1 0.6 0.75 100 0.9063094 0.8814245
0.4 1 0.6 0.75 150 0.9341907 0.9167316
0.4 1 0.6 1.00 50 0.8437768 0.8021311
0.4 1 0.6 1.00 100 0.9081301 0.8837226
0.4 1 0.6 1.00 150 0.9353556 0.9182012
0.4 1 0.8 0.50 50 0.8489455 0.8086970
0.4 1 0.8 0.50 100 0.9108959 0.8872197
0.4 1 0.8 0.50 150 0.9372487 0.9205817
0.4 1 0.8 0.75 50 0.8470530 0.8062679
0.4 1 0.8 0.75 100 0.9103863 0.8865694
0.4 1 0.8 0.75 150 0.9357930 0.9187593
0.4 1 0.8 1.00 50 0.8470534 0.8062228
0.4 1 0.8 1.00 100 0.9071108 0.8824569
0.4 1 0.8 1.00 150 0.9365201 0.9196854
0.4 2 0.6 0.50 50 0.9707360 0.9629766
0.4 2 0.6 0.50 100 0.9933758 0.9916205
0.4 2 0.6 0.50 150 0.9973067 0.9965934
0.4 2 0.6 0.75 50 0.9711006 0.9634423
0.4 2 0.6 0.75 100 0.9948316 0.9934627
0.4 2 0.6 0.75 150 0.9978164 0.9972381
0.4 2 0.6 1.00 50 0.9700804 0.9621556
0.4 2 0.6 1.00 100 0.9949771 0.9936468
0.4 2 0.6 1.00 150 0.9983987 0.9979747
0.4 2 0.8 0.50 50 0.9697170 0.9616974
0.4 2 0.8 0.50 100 0.9930116 0.9911602
0.4 2 0.8 0.50 150 0.9970155 0.9962250
0.4 2 0.8 0.75 50 0.9719013 0.9644558
0.4 2 0.8 0.75 100 0.9952686 0.9940155
0.4 2 0.8 0.75 150 0.9979620 0.9974223
0.4 2 0.8 1.00 50 0.9713175 0.9637259
0.4 2 0.8 1.00 100 0.9950500 0.9937390
0.4 2 0.8 1.00 150 0.9984715 0.9980667
0.4 3 0.6 0.50 50 0.9935216 0.9918056
0.4 3 0.6 0.50 100 0.9981074 0.9976062
0.4 3 0.6 0.50 150 0.9989809 0.9987110
0.4 3 0.6 0.75 50 0.9946857 0.9932783
0.4 3 0.6 0.75 100 0.9991265 0.9988952
0.4 3 0.6 0.75 150 0.9992721 0.9990794
0.4 3 0.6 1.00 50 0.9951957 0.9939231
0.4 3 0.6 1.00 100 0.9988354 0.9985269
0.4 3 0.6 1.00 150 0.9991266 0.9988953
0.4 3 0.8 0.50 50 0.9959234 0.9948437
0.4 3 0.8 0.50 100 0.9989081 0.9986190
0.4 3 0.8 0.50 150 0.9989809 0.9987111
0.4 3 0.8 0.75 50 0.9957054 0.9945679
0.4 3 0.8 0.75 100 0.9989811 0.9987112
0.4 3 0.8 0.75 150 0.9991994 0.9989873
0.4 3 0.8 1.00 50 0.9948317 0.9934625
0.4 3 0.8 1.00 100 0.9988355 0.9985271
0.4 3 0.8 1.00 150 0.9991994 0.9989873
Tuning parameter 'gamma' was held constant at a value of 0
Tuning parameter 'min_child_weight' was held constant at a
value of 1
Accuracy was used to select the optimal model using the largest value.
The final values used for the model were nrounds = 150, max_depth = 3, eta = 0.4, gamma = 0, colsample_bytree =
0.6, min_child_weight = 1 and subsample = 0.75.
# Score the fitted XGBoost model on the held-out validation set and
# compare its predictions with the true classe labels
predict_XGB <- predict(modelXGB, newdata = validationData)
confusionMatrix(validationData$classe, predict_XGB)
Confusion Matrix and Statistics
Reference
Prediction A B C D E
A 1674 0 0 0 0
B 2 1137 0 0 0
C 0 0 1026 0 0
D 0 0 0 964 0
E 0 0 0 0 1082
Overall Statistics
Accuracy : 0.9997
95% CI : (0.9988, 1)
No Information Rate : 0.2848
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9996
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: A Class: B Class: C Class: D Class: E
Sensitivity 0.9988 1.0000 1.0000 1.0000 1.0000
Specificity 1.0000 0.9996 1.0000 1.0000 1.0000
Pos Pred Value 1.0000 0.9982 1.0000 1.0000 1.0000
Neg Pred Value 0.9995 1.0000 1.0000 1.0000 1.0000
Prevalence 0.2848 0.1932 0.1743 0.1638 0.1839
Detection Rate 0.2845 0.1932 0.1743 0.1638 0.1839
Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
Balanced Accuracy 0.9994 0.9998 1.0000 1.0000 1.0000
With XGB we reach a better accuracy on the validation data.
Only 2 mislabeled predictions (A -> B).
# Collect the cross-validation resamples from both fitted models
results <- resamples(list(RF = modelRf, XGB = modelXGB))
# Summarise the distribution of the resampled metrics per model
summary(results)
Call:
summary.resamples(object = results)
Models: RF, XGB
Number of resamples: 5
Accuracy
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
RF 0.9930859 0.9959956 0.9959956 0.9959963 0.9970877 0.9978166 0
XGB 0.9985449 0.9989079 0.9996358 0.9992721 0.9996360 0.9996361 0
Kappa
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
RF 0.9912541 0.9949340 0.9949349 0.9949355 0.9963162 0.9972383 0
XGB 0.9981596 0.9986187 0.9995394 0.9990794 0.9995396 0.9995397 0
# Box-and-whisker plots of the resampled Accuracy/Kappa for each model
bwplot(results)
# Dot plots of the same resampling results, one point per fold
dotplot(results)
# Predict the 20 quiz rows with both models; drop the last column of
# testFilter (the non-predictor id column) — -ncol() is the direct form of
# the original -length(names()) construction
resultRf <- predict(modelRf, testFilter[, -ncol(testFilter)])
resultXGB <- predict(modelXGB, testFilter[, -ncol(testFilter)])
resultRf
[1] B A B A A E D B A A B C B A E E A B B B
Levels: A B C D E
resultXGB
[1] B A B A A E D B A A B C B A E E A B B B
Levels: A B C D E
# Cross-tabulate the two models' test-set predictions against each other
# to measure how much RF and XGB agree (not accuracy against truth)
confusionMatrix(resultRf, resultXGB)
Confusion Matrix and Statistics
Reference
Prediction A B C D E
A 7 0 0 0 0
B 0 8 0 0 0
C 0 0 1 0 0
D 0 0 0 1 0
E 0 0 0 0 3
Overall Statistics
Accuracy : 1
95% CI : (0.8316, 1)
No Information Rate : 0.4
P-Value [Acc > NIR] : 1.1e-08
Kappa : 1
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: A Class: B Class: C Class: D Class: E
Sensitivity 1.00 1.0 1.00 1.00 1.00
Specificity 1.00 1.0 1.00 1.00 1.00
Pos Pred Value 1.00 1.0 1.00 1.00 1.00
Neg Pred Value 1.00 1.0 1.00 1.00 1.00
Prevalence 0.35 0.4 0.05 0.05 0.15
Detection Rate 0.35 0.4 0.05 0.05 0.15
Detection Prevalence 0.35 0.4 0.05 0.05 0.15
Balanced Accuracy 1.00 1.0 1.00 1.00 1.00
Finally, both models predict the TEST data in the same way, but we noticed that XGB works better on the training set.